The purpose of this file is processing the combined data files for Summer 2022 into files that contain only valid data for analysis, excluding invalid sessions and participants
Data is imported from 2 files, indicating two levels of analysis: participants and blocks (item-level).
Note: mouse-cursor data contained in final_mouse_blocks.json file is not handled here.
# IMPORT DATA ----
# Participant-level export, tagged with the data-collection term.
df_participants <- fromJSON("input/su22_sgc3b_final_participants.json")
df_participants$term <- "summer22"

# Item-level export, tagged with the same term indicator.
df_items <- fromJSON("input/su22_sgc3b_final_items.json")
df_items$term <- "summer22"
# DEFINE SGC_3B validity criteria ----
# Session codes accepted as valid data-collection sessions.
sessions <- c("suPROLIFIC", "su22sona")
# Condition codes that belong to this study (6 conditions).
conditions <- c(111, 121, 211, 221, 311, 321)
# Number of allowable browser violations; participants with MORE than this are excluded.
violation_threshold <- 3
# Self-rated effort responses that lead to exclusion.
effort_exclusion <- c(
  "I didn't try very hard, or rushed through the questions",
  "I started out trying hard, but gave up at some point"
)
# Fifteen items make up a complete dataset per participant.
n_items <- 15
# Placeholder that accumulates excluded participants across the exclusion steps.
ex_participants <- data.frame()
Note: We drop all scores calculated in the stimulus engine (except the absolute score, which is a simple count of strictly correct responses), as they are recalculated during analysis using a different MC scoring algorithm.
# PARTICIPANTS: coerce column types, derive helper columns, then keep an
# explicitly ordered subset of columns.
# NOTE: `year` and `native_language` are derived below but are not listed in
# the select(), so they are dropped again; the raw `schoolyear` and `language`
# columns are the ones retained.
df_participants <- df_participants %>%
  mutate(
    # identifiers kept as character strings
    subject = as.character(subject),
    condition = as.character(condition),
    # human-readable label for each condition code
    pretty_condition = recode_factor(
      condition,
      "111" = "none-control", "121" = "none-impasse",
      "211" = "img-control", "221" = "img-impasse",
      "311" = "ixv-control", "321" = "ixv-impasse"
    ),
    # study / session descriptors
    study = factor(study),
    session = factor(session),
    exp_id = factor(exp_id),
    sona_id = as.character(sona_id),
    pool = factor(pool),
    mode = factor(mode),
    attn_check = factor(attn_check),
    status = factor(status),
    term = factor(term),
    # demographics (strip extraneous double quotes from country)
    gender = as.factor(gender),
    age = as.integer(age),
    country = gsub('"', "", country),
    year = factor(schoolyear),
    major = factor(major),
    # technical environment
    browser = factor(browser),
    os = factor(os),
    native_language = factor(language),
    # total time converted from milliseconds to minutes
    totaltime_m = totaltime / 1000 / 60
  ) %>%
  dplyr::select(
    # identifiers and design
    subject,
    study,
    condition,
    pretty_condition,
    session,
    exp_id,
    sona_id,
    pool,
    mode,
    attn_check,
    # (not kept) explanation
    # self-report measures
    effort,
    difficulty,
    confidence,
    enjoyment,
    other,
    # demographics
    age,
    country,
    language,
    schoolyear,
    major,
    gender,
    disability,
    # technical environment
    browser,
    width,
    height,
    os,
    starttime,
    status,
    term,
    violations,
    # only the absolute score is kept; the remaining engine scores
    # (discriminant_score, tri_score, orth_score, other_score, blank_score)
    # are dropped
    absolute_score,
    totaltime_m
  )
# ITEMS: coerce column types, derive helper columns, then keep an explicitly
# ordered subset of columns.
# NOTE THAT WE DROP ALL SCORES, WHICH ARE INCORRECTLY CALCULATED IN THE
# stimulus engine (they are rescored in the analysis file). We do not drop the
# raw responses (answers).
df_items <- df_items %>%
  mutate(
    # subject and condition are left as imported (conversions disabled)
    # human-readable label for each condition code
    pretty_condition = recode_factor(
      condition,
      "111" = "none-control", "121" = "none-impasse",
      "211" = "img-control", "221" = "img-impasse",
      "311" = "ixv-control", "321" = "ixv-impasse"
    ),
    pool = factor(pool),
    mode = factor(mode),
    # explicit, impasse, grid, mark and ixn are left as imported
    # (factor conversions disabled)
    term = factor(term),
    relation = factor(relation),
    block = factor(block),
    correct = factor(correct),
    q = factor(q),
    # response time converted from milliseconds to seconds
    rt_s = rt / 1000,
    # elapsed time converted from milliseconds to minutes
    time_elapsed_m = time_elapsed / 1000 / 60
  ) %>%
  dplyr::select(
    # identifiers and design
    subject,
    study,
    term,
    pool,
    mode,
    condition,
    pretty_condition,
    block,
    explicit,
    impasse,
    grid,
    mark,
    ixn,
    gwidth,
    gheight,
    graph,
    time_elapsed_m,
    # item content and response
    question,
    relation,
    q,
    correct,
    # engine scores (discriminant, tri_score, orth_score, other_score,
    # blank_score) are intentionally not selected
    answer,
    rt_s
  )
Starting with Winter 2022, data are saved to the database even if the subject’s browser did not meet minimum specifications (at which point they are prompted to change browsers, or end the study). This allows us to learn about the browsers, screen sizes and OS that (potential) subjects are using. However, these data are not exported from the database for analysis (see flatten.js and status.js scripts). Thus, only subjects who successfully completed the entire study are included in this file.
# MANUALLY INSPECT: number of participants per completion status
df_participants %>%
  group_by(status) %>%
  dplyr::summarize(n = n())
## # A tibble: 1 × 2
## status n
## <fct> <int>
## 1 success 54
54 participants successfully completed the study.
# DISCARD participants whose completion status is not "success"
# `%in%` is used instead of `!=` so that a missing (NA) status is also treated
# as invalid; with `!=` the comparison yields NA and filter() would silently
# leave such rows in the dataset. This also matches the form used for the
# condition and session exclusions.
exclude_status <- df_participants %>%
  filter(!status %in% "success") %>%
  mutate(reason = "invalid-status")
# Append to the running log of excluded participants, then drop the temporary
ex_participants <- rbind(ex_participants, exclude_status)
rm(exclude_status)
# Keep only participants that have not been excluded so far
df_participants <- df_participants %>%
  filter(!subject %in% ex_participants$subject)
No data need to be excluded on account of completion status.
Participants are randomly assigned to an experimental condition when starting the study. Here we validate that only conditions for the current study are included in this dataset.
# MANUALLY INSPECT: number of participants per condition code
df_participants %>%
  group_by(condition) %>%
  dplyr::summarize(n = n())
## # A tibble: 3 × 2
## condition n
## <chr> <int>
## 1 211 12
## 2 221 20
## 3 321 22
Data from conditions not corresponding to valid conditions should be discarded.
# DISCARD participants assigned to a condition that is not valid for this study
# Collect the offending rows and record why they are excluded
exclude_condition <- mutate(
  filter(df_participants, !(condition %in% conditions)),
  reason = "invalid-condition"
)
# Append to the running log of excluded participants, then drop the temporary
ex_participants <- rbind(ex_participants, exclude_condition)
rm(exclude_condition)
# Keep only participants that have not been excluded so far
df_participants <- filter(
  df_participants,
  !(subject %in% ex_participants$subject)
)
No data need to be excluded on account of condition.
The (string) session code is embedded in the URL
querystring by the experimenter to differentiate testing sessions in
SONA from demo and other environment setup tasks.
# MANUALLY INSPECT: number of participants per session code
df_participants %>%
  group_by(session) %>%
  dplyr::summarize(n = n())
## # A tibble: 1 × 2
## session n
## <fct> <int>
## 1 suPROLIFIC 54
# DISCARD participants whose session code is not one of the valid sessions
# Collect the offending rows and record why they are excluded
exclude_session <- mutate(
  filter(df_participants, !(session %in% sessions)),
  reason = "invalid-session"
)
# Append to the running log of excluded participants, then drop the temporary
ex_participants <- rbind(ex_participants, exclude_session)
rm(exclude_session)
# Keep only participants that have not been excluded so far
df_participants <- filter(
  df_participants,
  !(subject %in% ex_participants$subject)
)
No participant records were excluded on account of session (ie. app testing or pilot session).
Browser interaction data is recorded by jspsych allowing us to determine if subjects violate our instructions not to leave the browser tab (or exit fullscreen mode) during test. These incidents are recorded in jspsych interaction data object, and the number of violations is counted and added to the participant data file.
Due to eccentricity of the browser events captured, 1-2 browser violations can be captured even if the subject did not leave the browser window (eg. in case of resizing window to meet minimum requirements.)
# MANUALLY INSPECT: number of participants per browser-violation count
df_participants %>%
  dplyr::group_by(violations) %>%
  dplyr::summarize(n = n())
## # A tibble: 7 × 2
## violations n
## <dbl> <int>
## 1 1 33
## 2 1.5 4
## 3 2 9
## 4 2.5 1
## 5 3 5
## 6 4 1
## 7 8 1
# DISCARD participants with more browser interaction violations than allowed
# Collect rows strictly above the threshold and record why they are excluded
exclude_violations <- mutate(
  filter(df_participants, violations > violation_threshold),
  reason = "exceeded-violations"
)
# Append to the running log of excluded participants, then drop the temporary
ex_participants <- rbind(ex_participants, exclude_violations)
rm(exclude_violations)
# Keep only participants that have not been excluded so far
df_participants <- filter(
  df_participants,
  !(subject %in% ex_participants$subject)
)
Two participants were excluded for exceeding the maximum allowed number of browser interaction violations.
To assist in mitigating increased noise in data collected asynchronously from the UCSD student subject pool, we added explicit ratings of how much effort the participant expended on the task. This question was implemented as a multiple-choice drop-down on an ‘Effort’ page prior to the ‘Demographics’ survey at the end of the study. Subjects were given four options : (1) I tried my best on each question, (2) I tried my best on most questions, (3) I started out trying hard, but gave up at some point, (4) I didn’t try very hard, or rushed through the questions.
# MANUALLY INSPECT: number of participants per self-rated effort response
df_participants %>%
  group_by(effort) %>%
  dplyr::summarize(n = n())
## # A tibble: 2 × 2
## effort n
## <chr> <int>
## 1 I tried my best on each question 50
## 2 I tried my best on most questions 2
Participants answering with options I didn’t try very hard, or rushed through the questions or I started out trying hard, but gave up at some point are excluded from analysis.
# DISCARD participants whose self-rated effort is one of the excluded responses
# Collect the offending rows and record why they are excluded
exclude_effort <- mutate(
  filter(df_participants, effort %in% effort_exclusion),
  reason = "selfrated-effort"
)
# Append to the running log of excluded participants, then drop the temporary
ex_participants <- rbind(ex_participants, exclude_effort)
rm(exclude_effort)
# Keep only participants that have not been excluded so far
df_participants <- filter(
  df_participants,
  !(subject %in% ex_participants$subject)
)
No participants are excluded for low (self-rated) effort.
The 6th question in the study is non-discriminatory (can easily get correct answer regardless of strategy) and serves as an attention check question.
# MANUALLY INSPECT: number of participants per attention-check outcome
df_participants %>%
  group_by(attn_check) %>%
  dplyr::summarize(n = n())
## # A tibble: 2 × 2
## attn_check n
## <fct> <int>
## 1 FALSE 6
## 2 TRUE 46
Participants who answered the attention check question incorrectly should be excluded.
# FLAG participants who failed the attention check question
# NOTE: the flagged rows are only collected in `exclude_attn`. The steps that
# would append them to `ex_participants` and remove them from
# `df_participants` are commented out below, so these participants are retained.
exclude_attn <- mutate(
  filter(df_participants, attn_check == FALSE),
  reason = "failed-attnchk"
)
#
# ex_participants <- rbind(ex_participants, exclude_attn)
# rm(exclude_attn)
#
# df_participants <- df_participants %>%
# filter( ! subject %in% ex_participants$subject)
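Six participants answered the attention check question incorrectly; however, the exclusion step is currently disabled (commented out in the code above), so no participants are excluded for failing the attention check question.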
Next, we need to discard item_level data for excluded participants.
# Split the item-level data by participant exclusion status:
# rows belonging to excluded participants are set aside in `ex_items` ...
ex_items <- filter(df_items, subject %in% ex_participants$subject)
# ... and only rows belonging to retained participants stay in `df_items`.
df_items <- filter(df_items, !(subject %in% ex_participants$subject))
After all exclusions, we are left with the following number of participants per condition:
# MANUALLY INSPECT: participants per condition after all exclusions
df_participants %>%
  group_by(condition) %>%
  dplyr::summarize(n = n())
## # A tibble: 3 × 2
## condition n
## <chr> <int>
## 1 211 12
## 2 221 20
## 3 321 20
Finally, we need to validate we have a complete set of items for all valid participants.
# TRUE when the item rows total exactly n_items per remaining participant
nrow(df_items) == nrow(df_participants) * n_items
## [1] TRUE
# Attach codebook metadata to the participant-level data
# see https://cran.r-project.org/web/packages/codebook/vignettes/codebook_tutorial.html

# VARIABLE METADATA: import the data dictionary and use its
# VARIABLE / DESCRIPTION columns as variable labels
dict <- rio::import("input/dictionary_sgc3b_participants.csv", "csv")
var_label(df_participants) <- dict_to_list(
  dplyr::select(dict, VARIABLE, DESCRIPTION)
)

# DATASET METADATA
metadata(df_participants)$name <- "Experimental PARTICIPANTS for study SGC3B"
metadata(df_participants)$description <-
  "Data for study SGC3B summarized at PARTICIPANT level"
metadata(df_participants)$creator <- "Amy Rae Fox"
metadata(df_participants)$contact <- "amyraefox@gmail.com"

# {r, eval = checkMode() == "pdf"}  -- chunk header: ONLY FOR PDF KNIT
codebook::skim_codebook(df_participants)
| Name | data |
| Number of rows | 52 |
| Number of columns | 32 |
| _______________________ | |
| Column type frequency: | |
| character | 10 |
| factor | 13 |
| numeric | 9 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| subject | 0 | 1 | 5 | 5 | 0 | 52 | 0 |
| condition | 0 | 1 | 3 | 3 | 0 | 3 | 0 |
| sona_id | 0 | 1 | 24 | 24 | 0 | 48 | 0 |
| effort | 0 | 1 | 32 | 33 | 0 | 2 | 0 |
| other | 0 | 1 | 0 | 220 | 20 | 32 | 0 |
| country | 0 | 1 | 2 | 24 | 0 | 6 | 0 |
| language | 0 | 1 | 7 | 7 | 0 | 1 | 0 |
| schoolyear | 0 | 1 | 12 | 27 | 0 | 5 | 0 |
| disability | 0 | 1 | 0 | 90 | 11 | 15 | 0 |
| starttime | 0 | 1 | 24 | 24 | 0 | 52 | 0 |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| study | 0 | 1 | FALSE | 1 | SGC: 52 |
| pretty_condition | 0 | 1 | FALSE | 3 | img: 20, ixv: 20, img: 12 |
| session | 0 | 1 | FALSE | 1 | suP: 52 |
| exp_id | 0 | 1 | FALSE | 3 | 630: 20, 630: 20, 630: 12 |
| pool | 0 | 1 | FALSE | 1 | pro: 52 |
| mode | 0 | 1 | FALSE | 1 | asy: 52 |
| attn_check | 0 | 1 | FALSE | 2 | TRU: 46, FAL: 6 |
| major | 0 | 1 | FALSE | 7 | Soc: 15, Mat: 13, Fin: 7, Bio: 6 |
| gender | 0 | 1 | FALSE | 3 | Fem: 26, Mal: 21, Oth: 5 |
| browser | 0 | 1 | FALSE | 1 | chr: 52 |
| os | 0 | 1 | FALSE | 4 | Win: 27, Mac: 19, Chr: 5, Lin: 1 |
| status | 0 | 1 | FALSE | 1 | suc: 52 |
| term | 0 | 1 | FALSE | 1 | sum: 52 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | min | median | max | hist |
|---|---|---|---|---|---|---|---|---|
| difficulty | 0 | 1 | 3.33 | 1.06 | 1.00 | 3.0 | 5.00 | ▁▃▇▇▂ |
| confidence | 0 | 1 | 3.56 | 1.04 | 1.00 | 4.0 | 5.00 | ▁▂▅▇▃ |
| enjoyment | 0 | 1 | 3.75 | 1.19 | 1.00 | 4.0 | 5.00 | ▁▂▃▇▇ |
| age | 0 | 1 | 29.46 | 11.39 | 18.00 | 26.5 | 66.00 | ▇▃▂▁▁ |
| width | 0 | 1 | 1638.25 | 286.14 | 1222.00 | 1536.0 | 2498.00 | ▇▅▆▁▁ |
| height | 0 | 1 | 839.31 | 146.86 | 682.00 | 797.0 | 1369.00 | ▇▅▂▁▁ |
| violations | 0 | 1 | 1.43 | 0.66 | 1.00 | 1.0 | 3.00 | ▇▁▂▁▁ |
| absolute_score | 0 | 1 | 7.81 | 4.15 | 0.00 | 10.0 | 12.00 | ▂▂▂▂▇ |
| totaltime_m | 0 | 1 | 11.40 | 5.25 | 4.12 | 9.5 | 29.38 | ▇▅▂▂▁ |
# ONLY FOR HTML KNIT: render the codebook with just the metadata table enabled
codebook(
  df_participants,
  metadata_table = TRUE,
  detailed_variables = FALSE,
  detailed_scales = FALSE,
  metadata_json = FALSE,
  survey_overview = FALSE,
  missingness_report = FALSE
)
Dataset name: Experimental PARTICIPANTS for study SGC3B
Data for study SGC3B summarized at PARTICIPANT level
Date published: 2022-08-23
Creator:
| name | value |
|---|---|
| 1 | Amy Rae Fox |
|
|
# Attach codebook metadata to the item-level data
# see https://cran.r-project.org/web/packages/codebook/vignettes/codebook_tutorial.html

# VARIABLE METADATA: import the data dictionary and use its
# VARIABLE / DESCRIPTION columns as variable labels
dict <- rio::import("input/dictionary_sgc3b_items.csv", "csv")
var_label(df_items) <- dict_to_list(
  dplyr::select(dict, VARIABLE, DESCRIPTION)
)

# DATASET METADATA
metadata(df_items)$name <- "Experimental ITEMS for study SGC3B"
metadata(df_items)$description <-
  "Data for study SGC3B summarized at participant-item level"
metadata(df_items)$creator <- "Amy Rae Fox"
metadata(df_items)$contact <- "amyraefox@gmail.com"

# {r, eval = checkMode() == "pdf"}  -- chunk header: ONLY FOR PDF EXPORT
skim_codebook(df_items)
| Name | data |
| Number of rows | 780 |
| Number of columns | 23 |
| _______________________ | |
| Column type frequency: | |
| character | 8 |
| factor | 8 |
| numeric | 7 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| subject | 0 | 1 | 5 | 5 | 0 | 52 | 0 |
| study | 0 | 1 | 5 | 5 | 0 | 1 | 0 |
| condition | 0 | 1 | 3 | 3 | 0 | 3 | 0 |
| explicit | 0 | 1 | 1 | 1 | 0 | 3 | 0 |
| impasse | 0 | 1 | 1 | 1 | 0 | 2 | 0 |
| graph | 0 | 1 | 10 | 10 | 0 | 1 | 0 |
| question | 0 | 1 | 26 | 87 | 0 | 15 | 0 |
| answer | 0 | 1 | 0 | 7 | 16 | 61 | 0 |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| term | 0 | 1 | FALSE | 1 | sum: 780 |
| pool | 0 | 1 | FALSE | 1 | pro: 780 |
| mode | 0 | 1 | FALSE | 1 | asy: 780 |
| pretty_condition | 0 | 1 | FALSE | 3 | img: 300, ixv: 300, img: 180 |
| block | 0 | 1 | FALSE | 3 | ite: 364, ite: 260, ite: 156 |
| relation | 0 | 1 | FALSE | 10 | end: 104, mee: 104, mid: 104, sta: 104 |
| q | 0 | 1 | FALSE | 15 | 1: 52, 2: 52, 3: 52, 4: 52 |
| correct | 0 | 1 | FALSE | 2 | TRU: 520, FAL: 260 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | min | median | max | hist |
|---|---|---|---|---|---|---|---|---|
| grid | 0 | 1 | 1.00 | 0.00 | 1.00 | 1.00 | 1.00 | ▁▁▇▁▁ |
| mark | 0 | 1 | 1.00 | 0.00 | 1.00 | 1.00 | 1.00 | ▁▁▇▁▁ |
| ixn | 0 | 1 | 1.00 | 0.00 | 1.00 | 1.00 | 1.00 | ▁▁▇▁▁ |
| gwidth | 0 | 1 | 600.00 | 0.00 | 600.00 | 600.00 | 600.00 | ▁▁▇▁▁ |
| gheight | 0 | 1 | 600.00 | 0.00 | 600.00 | 600.00 | 600.00 | ▁▁▇▁▁ |
| time_elapsed_m | 0 | 1 | 6.13 | 3.97 | 0.48 | 5.21 | 21.82 | ▇▆▂▁▁ |
| rt_s | 0 | 1 | 28.70 | 32.70 | 1.36 | 18.40 | 396.48 | ▇▁▁▁▁ |
# ONLY FOR HTML EXPORT: render the codebook with just the metadata table enabled
codebook(
  df_items,
  metadata_table = TRUE,
  detailed_variables = FALSE,
  detailed_scales = FALSE,
  metadata_json = FALSE,
  survey_overview = FALSE,
  missingness_report = FALSE
)
Dataset name: Experimental ITEMS for study SGC3B
Data for study SGC3B summarized at participant-item level
Date published: 2022-08-23
Creator:
| name | value |
|---|---|
| 1 | Amy Rae Fox |
|
|
Exploration of the distribution of key response variables for validation purposes:
NOTE:: INTERPRET WITH CAUTION. THIS FILE DOES NOT INCLUDE DATA FOR THE ENTIRE STUDY, ONLY SUBJECTS RUN IN THIS DATA COLLECTION SESSION
# Participant-level absolute score: overall counts
gf_histogram(~absolute_score, data = df_participants) +
  labs(title = "SGC3B Distribution of Absolute Score")

# Participant-level absolute score: density, one panel per condition
gf_facet_wrap(
  gf_dhistogram(~absolute_score, data = df_participants),
  ~pretty_condition
) +
  labs(title = "SGC3B Distribution of Absolute Score (by Condition)")

# Item-level correctness: overall proportions
gf_props(~correct, data = df_items) +
  labs(title = "SGC3B Distribution of Item Absolute Score")

# Item-level correctness: proportions, one panel per condition
gf_facet_wrap(
  gf_props(~correct, data = df_items),
  ~pretty_condition
) +
  labs(title = "SGC3B Distribution of Item Absolute Score (by Condition)")

# Participant-level total study time (minutes)
gf_histogram(~totaltime_m, data = df_participants) +
  labs(title = "SGC3B Distribution of Total Study Time")
# Participant-level absolute score: counts, one panel per condition.
# The title now states that the plot is faceted by condition, matching the
# other faceted plots in this section.
# NOTE(review): this repeats the absolute-score-by-condition plot (as counts
# rather than density) directly after the total-time histogram; it may have
# been intended to show totaltime_m by condition -- confirm.
gf_histogram(~absolute_score, data = df_participants) %>%
  gf_facet_wrap(~pretty_condition) +
  labs(title = "SGC3B Distribution of Absolute Score (by Condition)")

# Item-level response time (seconds)
gf_histogram(~rt_s, data = df_items) +
  labs(title = "SGC3B Distribution of Item Response Time")

# Participant-level total study time (minutes) against absolute score.
# Title corrected: the plotted variables are totaltime_m and absolute_score
# from df_participants, not item-level response times.
gf_jitter(totaltime_m ~ absolute_score, data = df_participants) +
  labs(title = "SGC3B Total Study Time vs Absolute Score")
For transparency, we save and identify the excluded data.
# EXCLUDED data (saved for transparency)
write.csv(
  ex_participants,
  file = "output/excluded_participants_summer22_sgc3b.csv",
  row.names = FALSE
)
write.csv(
  ex_items,
  file = "output/excluded_items_summer22_sgc3b.csv",
  row.names = FALSE
)

# VALID data as CSV files
write.csv(
  df_participants,
  file = "output/su22_sgc3b_participants.csv",
  row.names = FALSE
)
write.csv(
  df_items,
  file = "output/su22_sgc3b_items.csv",
  row.names = FALSE
)

# VALID data as R data structure files (these include the codebook metadata)
rio::export(df_participants, "output/su22_sgc3b_participants.rds")
rio::export(df_items, "output/su22_sgc3b_items.rds")
NOTE:: INTERPRET WITH CAUTION. THIS FILE DOES NOT INCLUDE DATA FOR THE ENTIRE STUDY, ONLY SUBJECTS RUN IN THIS DATA COLLECTION SESSION
library(ggstatsplot)
## You can cite this package as:
## Patil, I. (2021). Visualizations with statistical details: The 'ggstatsplot' approach.
## Journal of Open Source Software, 6(61), 3167, doi:10.21105/joss.03167
# Absolute score compared between conditions (participant level)
ggbetweenstats(
  data = df_participants,
  x = pretty_condition,
  y = absolute_score,
  type = "nonparametric",
  var.equal = FALSE
)

# Item correctness by condition (item level); the title flags that the test
# is not valid because items are nested within subjects
ggbarstats(
  data = df_items,
  x = correct,
  y = pretty_condition,
  type = "nonparametric",
  var.equal = FALSE,
  title = "not valid bc items are nested in subject"
)